Steps
clean names and select necessary variables
rename the variables
drop any rows with missing observations
filter out the second row (contains comments about the columns)
join all the datasets
drop any rows with missing observations
create new column, id, using the geo id to help with matching to zipcodes # idk if need to do this actually
filter to only keep counties relevant to NYC
create new column for zipcode by matching to id
Crowding data
crowding18 =
read_csv("Data/crowding/2018_crowding.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, name, s2501_c01_006e, s2501_c01_007e, s2501_c01_008e) |>
rename(
less_than_1_y18 = s2501_c01_006e,
bet_1to1.5_y18 = s2501_c01_007e,
more_than_1.5_y18 = s2501_c01_008e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_than_1_y18","bet_1to1.5_y18","more_than_1.5_y18"), as.numeric)
## Rows: 4919 Columns: 458
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (458): GEO_ID, NAME, S2501_C01_001E, S2501_C01_001M, S2501_C01_002E, S25...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crowding19 =
read_csv("Data/crowding/2019_crowding.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s2501_c01_006e, s2501_c01_007e, s2501_c01_008e) |>
rename(
less_than_1_y19 = s2501_c01_006e,
bet_1to1.5_y19 = s2501_c01_007e,
more_than_1.5_y19 = s2501_c01_008e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_than_1_y19","bet_1to1.5_y19","more_than_1.5_y19"), as.numeric)
## Rows: 4919 Columns: 458
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (458): GEO_ID, NAME, S2501_C01_001E, S2501_C01_001M, S2501_C01_002E, S25...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crowding20 =
read_csv("Data/crowding/2020_crowding.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s2501_c01_006e, s2501_c01_007e, s2501_c01_008e) |>
rename(
less_than_1_y20 = s2501_c01_006e,
bet_1to1.5_y20 = s2501_c01_007e,
more_than_1.5_y20 = s2501_c01_008e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_than_1_y20","bet_1to1.5_y20","more_than_1.5_y20"), as.numeric)
## Rows: 5412 Columns: 458
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (458): GEO_ID, NAME, S2501_C01_001E, S2501_C01_001M, S2501_C01_002E, S25...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crowding21 =
read_csv("Data/crowding/2021_crowding.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s2501_c01_006e, s2501_c01_007e, s2501_c01_008e) |>
rename(
less_than_1_y21 = s2501_c01_006e,
bet_1to1.5_y21 = s2501_c01_007e,
more_than_1.5_y21 = s2501_c01_008e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_than_1_y21","bet_1to1.5_y21","more_than_1.5_y21"), as.numeric)
## Rows: 5412 Columns: 458
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (458): GEO_ID, NAME, S2501_C01_001E, S2501_C01_001M, S2501_C01_002E, S25...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crowding_10s =
full_join(crowding18, crowding19, by = "geo_id")
crowding_20s =
full_join(crowding20, crowding21, by = "geo_id")
crowding_all =
full_join(crowding_10s, crowding_20s, by = "geo_id") |>
drop_na() |>
mutate(
id = str_sub(geo_id, 10),
county = str_extract(name, "(?<=,)[^,]+(?=,)")
) |>
filter(county %in% c(" Kings County"," Bronx County"," Queens County"," Richmond County"," New York County"))
# can I make a map using geo id?
# my time plot cuts the year in half
# time plot title is cut off
Education Data
edu18 =
read_csv("Data/edu/edu_2018.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, name, s1501_c01_007e, s1501_c01_008e, s1501_c01_009e, s1501_c01_010e, s1501_c01_011e,
s1501_c01_012e, s1501_c01_013e) |>
rename(
less_9_y18 = s1501_c01_007e,
no_hs_diploma_y18 = s1501_c01_008e,
hs_grad_y18 = s1501_c01_009e,
some_college_y18 = s1501_c01_010e,
associate_y18 = s1501_c01_011e,
bachelor_y18 = s1501_c01_012e,
graduate_y18 = s1501_c01_013e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_9_y18","no_hs_diploma_y18","hs_grad_y18","some_college_y18","associate_y18",
"bachelor_y18","graduate_y18"), as.numeric) |>
mutate(
hs_or_less_y18 = rowSums(across(c(less_9_y18,no_hs_diploma_y18,hs_grad_y18))),
college_y18 = rowSums(across(c(some_college_y18,associate_y18,bachelor_y18,graduate_y18)))
)
## Rows: 4919 Columns: 770
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (770): GEO_ID, NAME, S1501_C01_001E, S1501_C01_001M, S1501_C01_002E, S15...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
edu19 =
read_csv("Data/edu/edu_2019.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1501_c01_007e, s1501_c01_008e, s1501_c01_009e, s1501_c01_010e, s1501_c01_011e,
s1501_c01_012e, s1501_c01_013e) |>
rename(
less_9_y19 = s1501_c01_007e,
no_hs_diploma_y19 = s1501_c01_008e,
hs_grad_y19 = s1501_c01_009e,
some_college_y19 = s1501_c01_010e,
associate_y19 = s1501_c01_011e,
bachelor_y19 = s1501_c01_012e,
graduate_y19 = s1501_c01_013e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_9_y19","no_hs_diploma_y19","hs_grad_y19","some_college_y19","associate_y19",
"bachelor_y19","graduate_y19"), as.numeric) |>
mutate(
hs_or_less_y19 = rowSums(across(c(less_9_y19,no_hs_diploma_y19,hs_grad_y19))),
college_y19 = rowSums(across(c(some_college_y19,associate_y19,bachelor_y19,graduate_y19)))
)
## Rows: 4919 Columns: 770
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (770): GEO_ID, NAME, S1501_C01_001E, S1501_C01_001M, S1501_C01_002E, S15...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
edu20 =
read_csv("Data/edu/edu_2020.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1501_c01_007e, s1501_c01_008e, s1501_c01_009e, s1501_c01_010e, s1501_c01_011e,
s1501_c01_012e, s1501_c01_013e) |>
rename(
less_9_y20 = s1501_c01_007e,
no_hs_diploma_y20 = s1501_c01_008e,
hs_grad_y20 = s1501_c01_009e,
some_college_y20 = s1501_c01_010e,
associate_y20 = s1501_c01_011e,
bachelor_y20 = s1501_c01_012e,
graduate_y20 = s1501_c01_013e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_9_y20","no_hs_diploma_y20","hs_grad_y20","some_college_y20","associate_y20",
"bachelor_y20","graduate_y20"), as.numeric) |>
mutate(
hs_or_less_y20 = rowSums(across(c(less_9_y20,no_hs_diploma_y20,hs_grad_y20))),
college_y20 = rowSums(across(c(some_college_y20,associate_y20,bachelor_y20,graduate_y20)))
)
## Rows: 5412 Columns: 770
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (770): GEO_ID, NAME, S1501_C01_001E, S1501_C01_001M, S1501_C01_002E, S15...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
edu21 =
read_csv("Data/edu/edu_2021.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1501_c01_007e, s1501_c01_008e, s1501_c01_009e, s1501_c01_010e, s1501_c01_011e,
s1501_c01_012e, s1501_c01_013e) |>
rename(
less_9_y21 = s1501_c01_007e,
no_hs_diploma_y21 = s1501_c01_008e,
hs_grad_y21 = s1501_c01_009e,
some_college_y21 = s1501_c01_010e,
associate_y21 = s1501_c01_011e,
bachelor_y21 = s1501_c01_012e,
graduate_y21 = s1501_c01_013e
) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("less_9_y21","no_hs_diploma_y21","hs_grad_y21","some_college_y21","associate_y21",
"bachelor_y21","graduate_y21"), as.numeric) |>
mutate(
hs_or_less_y21 = rowSums(across(c(less_9_y21,no_hs_diploma_y21,hs_grad_y21))),
college_y21 = rowSums(across(c(some_college_y21,associate_y21,bachelor_y21,graduate_y21)))
)
## Rows: 5412 Columns: 770
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (770): GEO_ID, NAME, S1501_C01_001E, S1501_C01_001M, S1501_C01_002E, S15...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
edu_10s =
full_join(edu18, edu19, by = "geo_id")
edu_20s =
full_join(edu20, edu21, by = "geo_id")
edu_all =
full_join(edu_10s, edu_20s, by = "geo_id") |>
drop_na() |>
mutate(
id = str_sub(geo_id, 10),
county = str_extract(name, "(?<=,)[^,]+(?=,)")
) |>
filter(county %in% c(" Kings County"," Bronx County"," Queens County"," Richmond County"," New York County"))
Poverty Data
poverty18 =
read_csv("Data/poverty/2018_poverty.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, name, s1701_c03_001e) |>
rename(
below_poverty_y18 = s1701_c03_001e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("below_poverty_y18"), as.numeric)
## Rows: 4919 Columns: 368
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (368): GEO_ID, NAME, S1701_C01_001E, S1701_C01_001M, S1701_C01_002E, S17...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
poverty19 =
read_csv("Data/poverty/2019_poverty.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1701_c03_001e) |>
rename(
below_poverty_y19 = s1701_c03_001e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("below_poverty_y19"), as.numeric)
## Rows: 4919 Columns: 368
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (368): GEO_ID, NAME, S1701_C01_001E, S1701_C01_001M, S1701_C01_002E, S17...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
poverty20 =
read_csv("Data/poverty/2020_poverty.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1701_c03_001e) |>
rename(
below_poverty_y20 = s1701_c03_001e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("below_poverty_y20"), as.numeric)
## Rows: 5412 Columns: 368
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (368): GEO_ID, NAME, S1701_C01_001E, S1701_C01_001M, S1701_C01_002E, S17...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
poverty21 =
read_csv("Data/poverty/2021_poverty.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, s1701_c03_001e) |>
rename(
below_poverty_y21 = s1701_c03_001e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("below_poverty_y21"), as.numeric)
## Rows: 5412 Columns: 374
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (374): GEO_ID, NAME, S1701_C01_001E, S1701_C01_001M, S1701_C01_002E, S17...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
poverty_10s =
full_join(poverty18, poverty19, by = "geo_id")
poverty_20s =
full_join(poverty20, poverty21, by = "geo_id")
poverty_all =
full_join(poverty_10s, poverty_20s, by = "geo_id") |>
drop_na() |>
mutate(
id = str_sub(geo_id, 10),
county = str_extract(name, "(?<=,)[^,]+(?=,)")
) |>
filter(county %in% c(" Kings County"," Bronx County"," Queens County"," Richmond County"," New York County"))
Vacancy Data
vacancy18 =
read_csv("Data/vacancy/2018_vacancy.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, name, b25004_001e, b25004_008e) |>
rename(
total_home_y18 = b25004_001e,
vacant_y18 = b25004_008e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("total_home_y18","vacant_y18"), as.numeric) |>
mutate(prop_vacant_y18 = vacant_y18/total_home_y18)
## Rows: 4919 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): GEO_ID, NAME, B25004_001E, B25004_001M, B25004_002E, B25004_002M, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
vacancy19 =
read_csv("Data/vacancy/2019_vacancy.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, b25004_001e, b25004_008e) |>
rename(
total_home_y19 = b25004_001e,
vacant_y19 = b25004_008e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("total_home_y19","vacant_y19"), as.numeric) |>
mutate(prop_vacant_y19 = vacant_y19/total_home_y19)
## New names:
## Rows: 4919 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (18): GEO_ID, NAME, B25004_001E, B25004_001M, B25004_002E, B25004_002M, ... lgl
## (1): ...19
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...19`
vacancy20 =
read_csv("Data/vacancy/2020_vacancy.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, b25004_001e, b25004_008e) |>
rename(
total_home_y20 = b25004_001e,
vacant_y20 = b25004_008e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("total_home_y20","vacant_y20"), as.numeric) |>
mutate(prop_vacant_y20 = vacant_y20/total_home_y20)
## New names:
## Rows: 5412 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (18): GEO_ID, NAME, B25004_001E, B25004_001M, B25004_002E, B25004_002M, ... lgl
## (1): ...19
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...19`
vacancy21 =
read_csv("Data/vacancy/2021_vacancy.csv", col_names = TRUE) |>
janitor::clean_names() |>
select(geo_id, b25004_001e, b25004_008e) |>
rename(
total_home_y21 = b25004_001e,
vacant_y21 = b25004_008e) |>
drop_na() |>
filter(!row_number() %in% c(1)) |>
mutate_at(c("total_home_y21","vacant_y21"), as.numeric) |>
mutate(prop_vacant_y21 = vacant_y21/total_home_y21)
## New names:
## Rows: 5412 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (18): GEO_ID, NAME, B25004_001E, B25004_001M, B25004_002E, B25004_002M, ... lgl
## (1): ...19
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...19`
vacancy_10s =
full_join(vacancy18, vacancy19, by = "geo_id")
vacancy_20s =
full_join(vacancy20, vacancy21, by = "geo_id")
vacancy_all =
full_join(vacancy_10s, vacancy_20s, by = "geo_id") |>
drop_na() |>
mutate(
id = str_sub(geo_id, 10),
county = str_extract(name, "(?<=,)[^,]+(?=,)")
) |>
filter(county %in% c(" Kings County"," Bronx County"," Queens County"," Richmond County"," New York County"))
It should be noted that the dataset included information about other types of properties but were unoccupied such as rental properties, those listed for sale, properties sold but unoccupied, those designated for migrant workers, and residences that were used occasionally/seasonally/recreationally. The only variable considered in this analysis was vacant properties however it is likely that residences that go unoccupied for several months can become nests for the rat population, increasing sightings in those areas. A more thorough and in-depth analysis should include the other variables provided in the dataset.
In general, might be losing information with the variables chosen for analysis. During data collection, the variables were further subsetted into age groups and race/ethnicity designations. However for this analysis, the general estimates were taken for the population as a whole in NYC.
When importing data from 2018 and 2019, there were 4918 census tracts and when importing data from 2020 and 2021, there were 5411 census tracts. The datasets for each year were joined together by geography id to only retain census tracts that were consistent across the years. There may have been adjustments or redfining areas but that could affect analysis.
Plots:
crowding_time = crowding_all |>
pivot_longer(less_than_1_y18:more_than_1.5_y21,
names_to = "occupancy",
values_to = "count") |>
mutate(year = str_sub(occupancy, -3),
category = str_sub(occupancy, end=-5)) |>
mutate(
year = case_match(
year,
"y18" ~ 2018,
"y19" ~ 2019,
"y20" ~ 2020,
"y21" ~ 2021
)
) |>
filter(category == "more_than_1.5") |>
group_by(county,year) |>
summarise(mean = mean(count)) |>
plot_ly(x = ~year, y = ~mean, color = ~county, type = "scatter", mode = "lines",
colors = "viridis") |>
layout(title = "Timeseries of Average Number of Housing Units with 1.51 or More Occupants/Room by Borough in NYC",
plot_bgcolor = "e5ecf6",
xaxis = list(autotypenumbers = 'strict', title = 'Year'),
yaxis = list(title = 'Average Number of Housing Units'))
## `summarise()` has grouped output by 'county'. You can override using the
## `.groups` argument.
crowding_time
Analysis
edu_time = edu_all |>
pivot_longer(less_9_y18:college_y21,
names_to = "education_level",
values_to = "count") |>
mutate(year = str_sub(education_level, -3),
category = str_sub(education_level, end=-5)) |>
mutate(
year = case_match(
year,
"y18" ~ 2018,
"y19" ~ 2019,
"y20" ~ 2020,
"y21" ~ 2021
)
) |>
filter(category == "hs_or_less") |>
group_by(county,year) |>
summarise(mean = mean(count)) |>
plot_ly(x = ~year, y = ~mean, color = ~county, type = "scatter", mode = "lines",
colors = "viridis") |>
layout(title = "Timeseries of Average Number of Individuals with HS or less Education Level by Borough in NYC",
plot_bgcolor = "e5ecf6",
xaxis = list(autotypenumbers = 'strict', title = 'Year'),
yaxis = list(title = 'Average Number of Individuals'))
## `summarise()` has grouped output by 'county'. You can override using the
## `.groups` argument.
edu_time
Analysis
poverty_time = poverty_all |>
pivot_longer(below_poverty_y18:below_poverty_y21,
names_to = "poverty_level",
values_to = "count") |>
mutate(year = str_sub(poverty_level, -3)) |>
mutate(
year = case_match(
year,
"y18" ~ 2018,
"y19" ~ 2019,
"y20" ~ 2020,
"y21" ~ 2021
)
) |>
group_by(county,year) |>
summarise(mean = mean(count)) |>
plot_ly(x = ~year, y = ~mean, color = ~county, type = "scatter", mode = "lines",
colors = "viridis") |>
layout(title = "Timeseries of Average Number of Housing Units Below the Poverty Level by Borough in NYC",
plot_bgcolor = "e5ecf6",
xaxis = list(autotypenumbers = 'strict', title = 'Year'),
yaxis = list(title = 'Average Number of Households Below Poverty Level'))
## `summarise()` has grouped output by 'county'. You can override using the
## `.groups` argument.
poverty_time
Analysis
vacancy_time = vacancy_all |>
pivot_longer(total_home_y18:prop_vacant_y21,
names_to = "vacancy",
values_to = "count") |>
mutate(year = str_sub(vacancy, -3),
category = str_sub(vacancy, end=-5)) |>
mutate(
year = case_match(
year,
"y18" ~ 2018,
"y19" ~ 2019,
"y20" ~ 2020,
"y21" ~ 2021
)
) |>
group_by(county,year) |>
summarise(mean = mean(count)) |>
plot_ly(x = ~year, y = ~mean, color = ~county, type = "scatter", mode = "lines",
colors = "viridis") |>
layout(title = "Timeseries of Average Proportion of Vacant Properties by Borough in NYC",
plot_bgcolor = "e5ecf6",
xaxis = list(autotypenumbers = 'strict', title = 'Year'),
yaxis = list(title = 'Average Proportion'))
## `summarise()` has grouped output by 'county'. You can override using the
## `.groups` argument.
vacancy_time
Analysis
make distribution plots (if possible) otherwise do barcharts comparing the different boroughs for variable of interest for 2021
explain how each of these variables may be related to rat sightings and increased presence, go back to the rat paper possibly compare to the rat distribution
upload everything to shared repository